1 Setup

Set the working dir

setwd("/mnt/picea/projects/spruce/sjansson/seasonal-needles/u2015030")

Libraries

suppressPackageStartupMessages(library(amap))
suppressPackageStartupMessages(library(corrplot))
suppressPackageStartupMessages(require(graphics))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(matrixStats))

Load table of counts vst transformed

load(file = "vst_aware.rda")

Focus on one year of sampling - samples past 2012-04-24 are the next generation of needles

sel <- ordered_dates %in% levels(ordered_dates)[1:(grep("2012-05",levels(ordered_dates))[1]-1)]
ordered_dates <- factor(as.character(ordered_dates)[sel])
vst_aware <- vst_aware[,sel]

Change the dates nomenclature

ddf <- do.call(rbind,strsplit(levels(ordered_dates),"-"))
labels <- ordered_dates
levels(labels) <- ifelse(as.integer(ddf[,3]) >= 25,
                         month.name[as.integer(ddf[,2])+1],
                         ifelse(as.integer(ddf[,3]) <= 10,
                                month.name[as.integer(ddf[,2])],
                                paste0("mid-",month.name[as.integer(ddf[,2])])
                         ))

plotLabels <- as.character(labels[match(levels(ordered_dates),ordered_dates)])
plotLabels[duplicated(plotLabels)] <- ""

Combine the replicates

vst_bio_rep_mean <- do.call(cbind,lapply(split.data.frame(t(vst_aware),ordered_dates),colMeans))

vst_bio_rep_median <- do.call(cbind,lapply(split.data.frame(t(vst_aware),ordered_dates),colMedians))
rownames(vst_bio_rep_median) <- rownames(vst_bio_rep_mean)

vst_bio_rep_sd <- do.call(cbind,lapply(split.data.frame(t(vst_aware),ordered_dates),colSds))

vst_bio_rep_lwr <- vst_bio_rep_mean-qt(0.975,3)*vst_bio_rep_sd/sqrt(3)

“2011-06-30” has no replicates, thus it is impossible to calculate sd, lwr and upr replacement of NA values in upr and lwr by the mean to be able to plot the confidence interval of the mean later

vst_bio_rep_lwr[,"2011-06-30"] <- vst_bio_rep_mean[,"2011-06-30"]
vst_bio_rep_lwr[vst_bio_rep_lwr < 0] <- 0

vst_bio_rep_upr <- vst_bio_rep_mean+qt(0.975,3)*vst_bio_rep_sd/sqrt(3)

vst_bio_rep_upr[,"2011-06-30"] <- vst_bio_rep_mean[,"2011-06-30"]

Scale the data

s.vst <- t(scale(t(vst_aware)))

s.vst.mean <- do.call(cbind,lapply(split.data.frame(t(s.vst),ordered_dates),colMeans))

s.vst.median <- do.call(cbind,lapply(split.data.frame(t(s.vst),ordered_dates),colMedians))
rownames(s.vst.median) <- rownames(s.vst.mean)

s.vst.sd <- do.call(cbind,lapply(split.data.frame(t(s.vst),ordered_dates),colSds))

s.vst.lwr <- s.vst.mean-qt(0.975,3)*s.vst.sd/sqrt(3)

“2011-06-30” has no replicates, thus it is impossible to calculate sd, lwr and upr replacement of NA values in upr and lwr by the mean to be able to plot the confidence interval of the mean later

s.vst.lwr[,"2011-06-30"] <- s.vst.mean[,"2011-06-30"]
#s.vst.lwr[s.vst.lwr < 0] <- 0

s.vst.upr <- s.vst.mean+qt(0.975,3)*s.vst.sd/sqrt(3)

s.vst.upr[,"2011-06-30"] <- s.vst.upr[,"2011-06-30"]

read Gene Of Interest csv file

goi <- read.csv("~/Git/UPSCb/projects/spruce-needles/doc/GOI_list.csv", header = FALSE)

read GOI names file

goi_names <- read.csv("~/Git/UPSCb/projects/spruce-needles/doc/GOI_names.txt", header=FALSE)

Do a loop to plot the expression profile for each gene of the list

for (i in 1:nrow(goi)) {
  message(i)
  #check if goi are in vst with %in%
  if (goi[i,1] %in% rownames(vst_aware)) {
    g_median <- vst_bio_rep_median[rownames(vst_bio_rep_median) == goi[i,1]]
    g_mean <- vst_bio_rep_mean[rownames(vst_bio_rep_mean) == goi[i,1]]
    g_lwr <- vst_bio_rep_lwr[rownames(vst_bio_rep_lwr) == goi[i,1]]
    g_upr <- vst_bio_rep_upr[rownames(vst_bio_rep_upr) == goi[i,1]]
    DF <- data.frame(time=as.Date(levels(ordered_dates)),
                     mean=g_median,
                     lwr=g_lwr,
                     upr=g_upr)
    
    p <- ggplot(DF, aes(time, group = 1)) +
      annotate("rect", xmin=as.Date("2011-10-01"),
                    xmax=as.Date("2012-03-01"),
                    ymin=-Inf,ymax=+Inf,
                    fill="#ADD8E6",alpha=0.3) +
      geom_line(aes(y=mean), color="blue") +
      geom_ribbon(aes(ymin=lwr, ymax=upr), alpha=0.3) + 
      scale_x_date(date_breaks = "month") +
      theme_bw() + coord_cartesian(ylim=c(0,max(vst_bio_rep_upr))) +
      theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
      labs(y = "vst counts", title = paste("Expression profile of", goi_names[i,1]))
    plot(p)
    suppressMessages(ggsave(filename = paste0("expression_profiles/",goi_names[i,1],"_expression_profile.jpeg")))
    
    DF <- data.frame(time=as.Date(levels(ordered_dates)),
                     mean=s.vst.mean[rownames(s.vst.mean) == goi[i,1]],
                     lwr=s.vst.lwr[rownames(s.vst.lwr) == goi[i,1]],
                     upr=s.vst.upr[rownames(s.vst.upr) == goi[i,1]])
    
    p <- ggplot(DF, aes(time, group = 1)) +
      annotate("rect", xmin=as.Date("2011-10-01"),
               xmax=as.Date("2012-03-01"),
               ymin=-Inf,ymax=+Inf,
               fill="#ADD8E6",alpha=0.3) +
      geom_line(aes(y=mean), color="blue") +
      geom_ribbon(aes(ymin=lwr, ymax=upr), alpha=0.3) +
      scale_x_date(date_breaks = "month") +
      theme_bw() + 
      theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
      labs(y = "standard score") +
      ggtitle(paste("Scaled expression profile of", goi_names[i,1]), 
              subtitle = paste("Average vst expression:",
                               round(mean(g_mean),digits=2)))
    plot(p)
    suppressMessages(ggsave(filename = paste0("expression_profiles/",goi_names[i,1],"_scaled_expression_profile.jpeg")))
    
  }
}
## 1

## 2

## 3

## 4

## 5

## 6

## 7

## 8

## 9

## 10

## 11

## 12

## 13

## 14

## 15

## 16

## 17

## 18

## 19

## 20

## 21

## 22

## 23

## 24

## 25

## 26

## 27

## 28

## 29

## 30

## 31

## 32

## 33

## 34

## 35

## 36

## 37

## 38

## 39

## 40

## 41

## 42

## 43

## 44

## 45

## 46

## 47

## 48

## 49

## 50

## 51

## 52

## 53

## 54

## 55

## 56

## 57

## 58

## 59

## 60

## 61

## 62

## 63

## 64

## 65

## 66

## 67

## 68

## 69

## 70

## 71

## 72
## 73

## 74

## 75

## 76

## 77

## 78

## 79

## 80

## 81

## 82

## 83

## 84

## 85

## 86

Remark ELIP_C does not match our data genes id ### Hierarchical clustering to see which genes have similar patterns Improve gene of interest data (add ELIP_C)

goi <- cbind(goi,goi_names)
colnames(goi) <- c("id","name")

select mean vst counts (of biological replicates for one time point) for gene of interest (goi)

vst_bio_rep_mean_goi <- vst_bio_rep_mean[rownames(vst_bio_rep_mean) %in% goi$id,]

replace gene ids by gene names as rownames

goi <- goi[! goi$name == "ELIP_C",]
goi <- goi[order(match(goi$id,rownames(vst_bio_rep_mean_goi))),]
rownames(vst_bio_rep_mean_goi) <- goi$name

Normalize data to obtain z-score to quantify only the variation around the mean (=pattern) and not the amplitude anymore for each each gene by doing so we can compare the genes relatively to their expression pattern and not relatively of their amplitude like before

vst_bio_rep_mean_goi_scaled <- t(scale(t(vst_bio_rep_mean_goi)))

Remove RabA1, which is not expressed

sel <- which(rowSums(is.na((vst_bio_rep_mean_goi_scaled))) > 0)
vst_bio_rep_mean_goi_scaled <- vst_bio_rep_mean_goi_scaled[-sel,]
vst_bio_rep_mean_goi <- vst_bio_rep_mean_goi[-sel,]

qqplot to check if our genes expression follows a Normal distribution or not

qqnorm(vst_bio_rep_mean_goi)
qqline(vst_bio_rep_mean_goi, col=3)

qqnorm(vst_bio_rep_mean_goi_scaled)
qqline(vst_bio_rep_mean_goi_scaled, col = 2)

==> does not follow a line, the distribution is not Normal ==> then Spearman correlation should be use because is not parametric Perform hierarchical clustering Firstly compute distance according to “spearman correlation” method Secondly compute clustering according to “complete” linkage

vst_bio_rep_mean_goi_cluster <- hcluster(vst_bio_rep_mean_goi, method = "spearman", link = "complete")
vst_bio_rep_mean_goi_scaled_cluster <- hcluster(vst_bio_rep_mean_goi_scaled, method = "spearman", link = "complete")

Plot the dendrogram

plot(vst_bio_rep_mean_goi_cluster, 
     main="Cluster Dendrogram of photosynthetic genes",
     cex=0.8)

plot(vst_bio_rep_mean_goi_scaled_cluster, 
     main="Cluster Dendrogram of photosynthetic genes",
     cex=0.8)

pdf(file="photosyntetic-genes-dendrogram.pdf",width=12,height=8)
plot(vst_bio_rep_mean_goi_scaled_cluster, 
     main="Cluster Dendrogram of photosynthetic genes",
     cex=0.8,xlab="photosyntetic genes",sub="spearman distance, complete linkage")
dev.off()
## png 
##   2

Plot the correlation

pdf(file = "expression_profiles/photosynthetic_genes_correlation_matrix.pdf")
corrplot(cor(t(vst_bio_rep_mean_goi)),method="square",order="hclust", 
         title = "Correlation matrix of photosynthetic genes",
         tl.cex=0.4, cl.cex=0.5, tl.col="black", addrect=2, is.corr = FALSE, mar = c(0,0,2,0))
dev.off()
## png 
##   2
corrplot(cor(t(vst_bio_rep_mean_goi_scaled)),method="square",order="hclust",
        tl.cex=0.4, cl.cex=0.5, tl.col="black", addrect=2, is.corr = FALSE)

2 Session Info

## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] matrixStats_0.53.1 ggplot2_2.2.1      corrplot_0.84     
## [4] amap_0.8-14       
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.16     knitr_1.20       magrittr_1.5     munsell_0.4.3   
##  [5] colorspace_1.3-2 rlang_0.2.0      stringr_1.3.0    plyr_1.8.4      
##  [9] tools_3.4.3      grid_3.4.3       gtable_0.2.0     htmltools_0.3.6 
## [13] yaml_2.1.18      lazyeval_0.2.1   rprojroot_1.3-2  digest_0.6.15   
## [17] tibble_1.4.2     evaluate_0.10.1  rmarkdown_1.9    labeling_0.3    
## [21] stringi_1.1.7    compiler_3.4.3   pillar_1.2.1     scales_0.5.0    
## [25] backports_1.1.2